## steps
# extract data on debtor level
# import debtor dataset
# inner join between the two datasets to remove redundant debtors and import size flag + nace_code
# exclude only observations that do not have any amount reported

# final datasets
import pandas as pd
import pyodbc
import os

# for NUTS
path_data = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets'
if not os.path.isdir(path_data):
    os.makedirs(path_data)



# debtor level
sql_conn = pyodbc.connect('DSN=DISC DP Impala 64bit',autocommit=True)

query = """
SELECT DISTINCT *

FROM (

SELECT           cntry_dbtr, 
                 dbtr_id,
                 nace_code,
                 SUM(outst_amnt_shr_sq) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS HHI,
                 COUNT(crdtr_id) OVER (PARTITION BY cntry_dbtr, dbtr_id) as nmbr_crdtrs,
                 mb_shr,
                 outst_amnt_ttl as ONA_dbtr
                 
FROM (
 SELECT DISTINCT cntry_dbtr, 
                 dbtr_id, 
                 nace_code,
                 outst_amnt_ttl, 
                 outst_amnt_shr_sq,
                 crdtr_id,
                 outst_amnt_per_crdtr,
                 outst_amnt_ttl_max,
                 outst_amnt_ttl_max/outst_amnt_ttl AS mb_shr                
 
 FROM (
 SELECT          cntry_dbtr, 
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 nace_code,
                 MAX(CASE WHEN default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = -99 THEN -99
                      WHEN (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = -99) OR
                           (default_aggr_dbtr_lvl = 0 AND default_dbtr_lvl = 0) OR
                           (default_aggr_dbtr_lvl = -99 AND default_dbtr_lvl = 0) THEN 0
                      ELSE 1 END) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS dflt_stts_debtor,
                 outst_amnt_per_crdtr,
                 outst_amnt_ttl,
                 (outst_amnt_per_crdtr/outst_amnt_ttl)*(outst_amnt_per_crdtr/outst_amnt_ttl) AS outst_amnt_shr_sq,
                 MAX(outst_amnt_per_crdtr) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl_max
                 
 FROM (
 SELECT          cntry_dbtr,
                 dbtr_id, 
                 crdtr_id,
                 cntrct_id,
                 instrmnt_id,
                 t2.nace_code,
                 otstndng_nmnl_amnt_cv_c,
                 MAX(dflt_stts_instr_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_aggr_dbtr_lvl,
                 MAX(dflt_stts_debtor_flag) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS default_dbtr_lvl,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id, crdtr_id) AS outst_amnt_per_crdtr,
                 SUM(otstndng_nmnl_amnt_cv_c) OVER (PARTITION BY cntry_dbtr, dbtr_id) AS outst_amnt_ttl

FROM            lab_org_dgr_fir.ac_instr_201912_kk t1
LEFT JOIN       lab_org_dgr_fir.ac_dbtr_201912_kk t2
ON              t1.dbtr_id=t2.entty_riad_id

WHERE           nvl(otstndng_nmnl_amnt_cv_c,0) + nvl(off_blnc_sht_amnt_cv,0) + nvl(cmmtmnt_incptn_cv,0) > 0                              
                AND prjct_fnnc_ln != 1
                
          ) tbl
 
      
                                ) tbl2
                                                
WHERE    dflt_stts_debtor = 0 

) tbl3
                                                
WHERE mb_shr IS NOT NULL   

) tbl4               

                """
                
dd_dbtr = pd.read_sql(query, sql_conn)
sql_conn.close()

# check
dd_dbtr.isna().sum()
ch = dd_dbtr.describe()

# import final debtors dataset
dbtr_f = pd.read_stata('dbtr_combined_dt.dta')

# merge datasets and take size flag
dd_merged = dd_dbtr.merge(dbtr_f[['cntry', 'entty_riad_id','sz_f']], how='left', left_on=['cntry_dbtr', 'dbtr_id'], right_on=['cntry', 'entty_riad_id'])
dd_merged.isna().sum()

# final dataset
dd_final = dd_merged[dd_merged.sz_f.isna() == False][['cntry_dbtr', 'dbtr_id', 'sz_f', 'nace_code', 'hhi', 'nmbr_crdtrs', 'mb_shr', 'ona_dbtr']].copy()

# export dataset
dd_final.to_stata('dbtr_lvl_dt_f.dta', write_index=False)

